In [1]:
%matplotlib inline

In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from plotnine import *

In [3]:
!ls -lah ../data/*csv


-rw-rw-r-- 1 ilya ilya 1.6K Apr 19 11:05 ../data/2017-03-09_NextSeq.csv
-rw-rw-r-- 1 ilya ilya 457K Oct 26  2016 ../data/dfa_mp.offset_150.win_100.csv
-rw-rw-r-- 1 ilya ilya 461K Oct 26  2016 ../data/dfa_mp.offset_150.win_200.csv
-rw-rw-r-- 1 ilya ilya 453K Oct 26  2016 ../data/dfa_mp.offset_150.win_50.csv
-rw-rw-r-- 1 ilya ilya 455K Oct 26  2016 ../data/dfa_mp.offset_150.win_80.csv
-rw-rw-r-- 1 ilya ilya 457K Oct 26  2016 ../data/dfa_mp.offset_200.win_100.csv
-rw-rw-r-- 1 ilya ilya 461K Oct 26  2016 ../data/dfa_mp.offset_200.win_200.csv
-rw-rw-r-- 1 ilya ilya 454K Oct 26  2016 ../data/dfa_mp.offset_200.win_50.csv
-rw-rw-r-- 1 ilya ilya 456K Oct 26  2016 ../data/dfa_mp.offset_200.win_80.csv
-rw-rw-r-- 1 ilya ilya 457K Oct 26  2016 ../data/dfa_mp.offset_300.win_100.csv
-rw-rw-r-- 1 ilya ilya 461K Oct 26  2016 ../data/dfa_mp.offset_300.win_200.csv
-rw-rw-r-- 1 ilya ilya 454K Oct 26  2016 ../data/dfa_mp.offset_300.win_50.csv
-rw-rw-r-- 1 ilya ilya 455K Oct 26  2016 ../data/dfa_mp.offset_300.win_80.csv
-rw-rw-r-- 1 ilya ilya 603K Jul 16  2016 ../data/dfm.csv
-rw-rw-r-- 1 ilya ilya 2.5K Apr 19 11:05 ../data/Lexogen_Sense_RNA-Seq.csv
-rw-rw-r-- 1 ilya ilya 3.3M Apr 19 11:05 ../data/utr.counts.csv

Load the data


In [4]:
offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../data/dfa_mp.offset_{}.win_{}.csv'

output = []

for offset in offsets:
    for winsize in winsizes:
        df = pd.DataFrame.from_csv(output_tpl.format(offset, winsize))
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)
        
dfa = pd.concat(output)

In [5]:
dfa['UTR_length'] = dfa['end_x'] - dfa['start_x']
dfa


Out[5]:
TSS end_x start_x gene strand_x end_y start_y strand_y strand ratio_ATCACG ratio_ACAGTG ratio_CGATGT ratio_GCCAAT win offset UTR_length
0 148 190 148 thrL + 255.0 190.0 + + 3.000000 2.784355 0.911828 3.178117 50 150 42
1 148 190 148 thrL + 255.0 190.0 + + 3.000000 2.784355 0.911828 3.178117 50 150 42
2 5030 5234 5030 yaaX + 5530.0 5234.0 + + 4.576923 6.983333 1.264901 1.436242 50 150 204
3 6587 6587 6459 yaaA - 6459.0 5683.0 - - 0.032028 0.072193 0.567568 0.600000 50 150 128
4 6615 6615 6459 yaaA - 6459.0 5683.0 - - 0.034091 0.090379 0.654135 0.582011 50 150 156
5 8017 8017 7959 yaaJ - 7959.0 6529.0 - - 0.875000 0.571429 0.885246 1.196262 50 150 58
6 8191 8238 8191 talB + 9191.0 8238.0 + + 0.478825 0.513356 0.473950 0.564393 50 150 47
9 11542 11542 11356 yaaW - 11356.0 10643.0 - - 0.666667 1.777778 1.327273 1.012658 50 150 186
10 11825 11825 11786 yaaI - 11786.0 11382.0 - - 0.500000 2.625000 0.652330 0.474874 50 150 39
11 11913 11913 11786 yaaI - 11786.0 11382.0 - - 0.333333 0.555556 1.748148 1.713376 50 150 127
12 11938 11938 11786 yaaI - 11786.0 11382.0 - - 0.857143 0.428571 1.100592 1.442623 50 150 152
13 12048 12163 12048 dnaK + 14079.0 12163.0 + + 0.252212 0.207481 0.171599 0.301158 50 150 115
14 12123 12163 12123 dnaK + 14079.0 12163.0 + + 0.869191 0.539653 0.430504 1.010352 50 150 40
15 12144 12163 12144 dnaK + 14079.0 12163.0 + + 0.979294 0.717621 0.513948 1.066012 50 150 19
18 16951 16951 16903 hokC - 16903.0 16751.0 - - 0.478261 0.569767 0.599631 0.459902 50 150 48
19 17317 17489 17317 nhaA + 18655.0 17489.0 + + 0.052632 0.126904 2.822222 1.647166 50 150 172
20 17458 17489 17458 nhaA + 18655.0 17489.0 + + 1.067073 1.989583 0.762238 1.602339 50 150 31
21 21120 21120 21078 rpsT - 21078.0 20815.0 - - 0.752518 0.615503 0.493768 0.752228 50 150 42
22 21210 21210 21078 rpsT - 21078.0 20815.0 - - 0.278619 0.579581 0.220507 0.358928 50 150 132
23 21383 21407 21383 ribF + 22348.0 21407.0 + + 0.922207 1.056693 0.849432 0.966921 50 150 24
24 21833 22391 21833 ileS + 25207.0 22391.0 + + 1.352113 1.040936 1.098859 1.163180 50 150 558
25 22034 22391 22034 ileS + 25207.0 22391.0 + + 0.528970 0.743542 0.934363 0.388699 50 150 357
26 22229 22391 22229 ileS + 25207.0 22391.0 + + 0.418221 0.240061 0.299776 0.510862 50 150 162
27 25014 25207 25014 lspA + 25701.0 25207.0 + + 0.850227 0.498730 0.592040 0.854137 50 150 193
28 28288 28374 28288 dapB + 29195.0 28374.0 + + 0.544828 1.341176 0.757576 0.496063 50 150 86
29 28343 28374 28343 dapB + 29195.0 28374.0 + + 1.752809 1.933333 1.785714 1.243902 50 150 31
30 29551 29651 29551 carA + 30799.0 29651.0 + + 0.790000 0.430233 0.240310 0.424419 50 150 100
31 29619 29651 29619 carA + 30799.0 29651.0 + + 0.788462 0.435897 0.461957 0.466912 50 150 32
32 30775 30817 30775 carB + 34038.0 30817.0 + + 0.513514 0.761194 0.406593 1.136000 50 150 42
33 34218 34300 34218 caiF + 34695.0 34300.0 + + 0.764706 1.388889 0.357143 0.403846 50 150 82
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3754 4609344 4609414 4609344 prfC + 4611003.0 4609414.0 + + 1.043222 0.723374 0.815589 1.112554 200 300 70
3755 4609356 4609414 4609356 prfC + 4611003.0 4609414.0 + + 1.113715 0.751312 0.856031 1.154138 200 300 58
3756 4611153 4611396 4611153 osmY + 4612001.0 4611396.0 + + 1.070175 1.486726 0.866915 0.928707 200 300 243
3757 4616679 4617323 4616679 deoC + 4618102.0 4617323.0 + + 1.140625 0.526882 1.102041 0.934307 200 300 644
3758 4617278 4617323 4617278 deoC + 4618102.0 4617323.0 + + 1.826840 2.649815 1.678423 2.319249 200 300 45
3759 4619567 4619603 4619567 deoB + 4620826.0 4619603.0 + + 0.520743 0.548993 0.379441 0.564190 200 300 36
3760 4621657 4621769 4621657 yjjJ + 4623100.0 4621769.0 + + 3.920833 14.337209 1.594747 1.465487 200 300 112
3761 4621716 4621769 4621716 yjjJ + 4623100.0 4621769.0 + + 1.156682 2.305970 0.783037 0.875229 200 300 53
3762 4624238 4624238 4624117 lplA - 4624117.0 4623101.0 - - 2.214286 1.905263 1.272436 1.488294 200 300 121
3763 4624799 4624799 4624789 ytjB - 4624789.0 4624145.0 - - 1.145985 1.015267 0.684332 0.680734 200 300 10
3764 4624856 4624895 4624856 serB + 4625863.0 4624895.0 + + 1.471910 2.332288 1.002410 1.592798 200 300 39
3765 4630566 4630566 4630522 yjjK - 4630522.0 4628855.0 - - 0.975590 1.301775 0.613567 0.892770 200 300 44
3766 4630700 4630733 4630700 slt + 4632670.0 4630733.0 + + 0.843023 0.823171 0.951872 0.894191 200 300 33
3767 4632704 4632760 4632704 trpR + 4633086.0 4632760.0 + + 1.302372 1.444231 0.835112 0.975930 200 300 56
3768 4633773 4633773 4633745 yjjX - 4633745.0 4633233.0 - - 3.361868 3.631399 1.012745 1.262749 200 300 28
3769 4633899 4633899 4633745 yjjX - 4633745.0 4633233.0 - - 3.716738 5.043478 1.026196 1.508820 200 300 154
3770 4635243 4635521 4635243 creA + 4635994.0 4635521.0 + + 2.986159 2.272251 1.123288 1.083676 200 300 278
3771 4635353 4635353 4635310 rob - 4635310.0 4634441.0 - - 0.902421 0.560804 0.475946 0.853982 200 300 43
3772 4635477 4635521 4635477 creA + 4635994.0 4635521.0 + + 0.989286 1.091716 0.517182 0.904128 200 300 44
3773 4638160 4638178 4638160 creD + 4639530.0 4638178.0 + + 1.642857 3.800000 1.421053 0.748068 200 300 18
3774 4640358 4640402 4640358 yjjY + 4640542.0 4640402.0 + + 13.010830 11.512545 14.250000 7.067416 200 300 44
3775 4640508 4640508 4640306 arcA - 4640306.0 4639590.0 - - 1.163365 0.827256 0.823056 1.180585 200 300 202
3776 4640512 4640512 4640306 arcA - 4640306.0 4639590.0 - - 1.167142 0.837495 0.831858 1.187599 200 300 206
3777 4640535 4640535 4640306 arcA - 4640306.0 4639590.0 - - 1.057403 0.763457 0.804410 1.089905 200 300 229
3778 4640599 4640599 4640306 arcA - 4640306.0 4639590.0 - - 0.867294 0.786859 0.757342 0.862293 200 300 293
3779 4640681 4640681 4640306 arcA - 4640306.0 4639590.0 - - 0.542907 0.452288 0.399267 0.477665 200 300 375
3780 4640688 4640688 4640306 arcA - 4640306.0 4639590.0 - - 0.515849 0.440549 0.386567 0.455797 200 300 382
3781 4640801 4640801 4640306 arcA - 4640306.0 4639590.0 - - 0.089461 0.110785 0.126010 0.168638 200 300 495
3782 4640838 4640942 4640838 yjtD + 4641628.0 4640942.0 + + 1.639535 0.945946 1.095890 2.051546 200 300 104
3783 4640898 4640942 4640898 yjtD + 4641628.0 4640942.0 + + 1.056604 0.763636 0.952381 1.453782 200 300 44

43812 rows × 16 columns

Let's see how it looks like for one particular window and offset value


In [6]:
d = dfa[(dfa['UTR_length'] > 80)
        & (dfa['ratio_ATCACG'] > 2)
        & (dfa['offset'] == 200)
        & (dfa['win'] == 80)][['UTR_length', 'ratio_ATCACG','ratio_CGATGT']].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])

In [7]:
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='lowess', span=1/7.)
print(p)


/home/ilya/.venv/pydata3/lib/python3.5/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"
<ggplot: (8770817038077)>

In [11]:
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='mavg')
print(p)


/home/ilya/.venv/pydata3/lib/python3.5/site-packages/plotnine/stats/stat_smooth.py:167: UserWarning: No 'window' specified in the method_args. Using window = 29. The same window is used for all groups or facets
  "facets".format(window))
<ggplot: (-9223363266037642885)>

In [12]:
p = ggplot(d, aes(x='loglen', y='log+bcm')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='lowess', span=1/5.) \
        + scale_y_continuous(limits=(-1,2.5))
print(p)


/home/ilya/.venv/pydata3/lib/python3.5/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"
<ggplot: (8770817089544)>

In [12]:
p = ggplot(d, aes(x='loglen', y='log+bcm')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='ma', window=25) \
        + scale_y_continuous(limits=(-1,2.5))
print(p)


/home/ilya/src/ggplot/ggplot/utils/smoothers.py:61: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,window=25).std()
  std_err = pd.rolling_std(y, window)
/home/ilya/src/ggplot/ggplot/utils/smoothers.py:62: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,window=25).mean()
  y = pd.rolling_mean(y, window)
<ggplot: (-9223363247313110570)>

In [13]:
p = ggplot(d, aes(x='loglen', y='ratio_ATCACG')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='lowess', span=1/17.)
print(p)


<ggplot: (8789541660943)>

In [14]:
p = ggplot(d, aes(x='loglen', y='ratio_ATCACG')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='ma', window=20)
print(p)


/home/ilya/src/ggplot/ggplot/utils/smoothers.py:61: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,window=20).std()
  std_err = pd.rolling_std(y, window)
/home/ilya/src/ggplot/ggplot/utils/smoothers.py:62: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,window=20).mean()
  y = pd.rolling_mean(y, window)
<ggplot: (8789541625955)>

What about other offset and win values?


In [15]:
d = dfa[(dfa['UTR_length'] > 80)
        & (dfa['ratio_ATCACG'] > 2)][[
            'TSS', 'gene', 'UTR_length', 
            'ratio_ATCACG','ratio_CGATGT', 'offset', 'win']].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])

In [16]:
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1, size=1) \
        + geom_smooth(method='lowess', span=1/5.) \
        + facet_wrap('win')
print(p)


<ggplot: (-9223363247254408255)>

In [17]:
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1, size=1) \
        + geom_smooth(method='lowess', span=1/5.) \
        + facet_wrap('offset')
print(p)


<ggplot: (-9223363247311318805)>

In [18]:
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1, size=1) \
        + geom_smooth(method='lowess', span=1/5.) \
        + facet_grid('offset ~ win')
print(p)


<ggplot: (-9223363247333806669)>

In [19]:
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1, size=1) \
        + geom_smooth(method='ma', window=20) \
        + facet_grid('offset ~ win')
print(p)


/home/ilya/src/ggplot/ggplot/utils/smoothers.py:61: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,window=20).std()
  std_err = pd.rolling_std(y, window)
/home/ilya/src/ggplot/ggplot/utils/smoothers.py:62: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,window=20).mean()
  y = pd.rolling_mean(y, window)
<ggplot: (8789541669475)>

Long UTRs


In [20]:
d = dfa[(dfa['UTR_length'] > 80)
        & (dfa['UTR_length'] < 600)
        & (dfa['ratio_ATCACG'] > 2)
        & (dfa['offset'] == 200)
        & (dfa['win'] == 80)][['TSS', 'gene', 'UTR_length', 'ratio_ATCACG','ratio_CGATGT']].copy()
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']

d1 = d[['UTR_length', 'loglen', 'log-bcm']].rename(columns={'log-bcm': 'logratio'})
d1['bcm'] = '-'
d2 = d[['UTR_length', 'loglen', 'log+bcm']].rename(columns={'log+bcm': 'logratio'})
d2['bcm'] = '+'

_d = pd.concat([d1, d2])

In [21]:
p = ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm')) \
        + geom_point(alpha=0.25) \
        + geom_smooth(method='lowess', span=1/5., size=3) \
        + xlab("5' UTR length") \
        + ylab("log(proximal/distal)") \
        + theme(axis_title=element_text(size=20),
                axis_text=element_text(size=20))
print(p)


<ggplot: (-9223363247311214462)>

In [22]:
d = dfa[(dfa['UTR_length'] > 80)
        & (dfa['UTR_length'] < 600)
        & (dfa['ratio_ATCACG'] > 2)][['TSS', 'win', 'offset', 'gene', 'UTR_length', 'ratio_ATCACG','ratio_CGATGT']].copy()
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])

d1 = d[['UTR_length', 'win', 'offset', 'loglen', 'log-bcm']].rename(columns={'log-bcm': 'logratio'})
d1['bcm'] = '-'
d2 = d[['UTR_length', 'win', 'offset', 'loglen', 'log+bcm']].rename(columns={'log+bcm': 'logratio'})
d2['bcm'] = '+'

_d = pd.concat([d1, d2])

In [23]:
p = ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm')) \
        + geom_point(alpha=0.25) \
        + geom_smooth(method='lowess', span=1/5., size=3) \
        + xlab("5' UTR length") \
        + ylab("log(proximal/distal)") \
        + theme(axis_title=element_text(size=20),
                axis_text=element_text(size=20)) \
        + facet_grid('offset ~ win')
print(p)


<ggplot: (8789543465003)>

In [ ]: